########################################################
## This code trains the models using pooled data from several years.
########################################################

########################################################
## Preparation of the workspace
########################################################

## Start from an empty workspace: drop every object that ls() lists
## (names beginning with a dot are not listed and therefore survive)
rm(list = ls(all.names = FALSE))

## load the required packages
library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)
library(VIM)
library(base)
library(ranger)
library(glmnet)
library(xgboost)

## Record when the run started, so the total run time can be worked out later
start_time <- Sys.time()

## Project root and the sub-folders used by this script
main <- "placeholder_main"

dataDirectory <- file.path(main, "Data")
RDirectory <- file.path(main, "Programs", "Output Generation")

## Bring in the project's helper scripts (presumably where tuning() and
## estimating(), used further down, are defined)
source(file.path(RDirectory, "102_0_caret_parameter_tuning_Function.R"))
source(file.path(RDirectory, "103_0_caret_predictions_Function.R"))


########################################################
## Define the locals for the loop
########################################################
 
# Model specification(s) to run; the name is part of the input .dta file name
model_list <- "Full_Pooled"

# Outcome variables to predict, built from the month offsets 0, 6 and 12
dependent_variable <- paste0("emplAft6M_", c(0, 6, 12), "M_In")

# Pooled time periods (kept as a list, one period label per element)
years <- as.list(c("2006_2007", "2009_2010"))

########################################################
## Running the loop
########################################################   

## Outcome on which the hyper-parameters are tuned. The tuned parameters are
## reused when training the models for every outcome of the same model/period.
tuning_dependent <- "emplAft6M_0M_In"

## Build the path of an output file:
##   <directory>/<prefix>_<model>_<dependent>_<period><extension>
result_file <- function(prefix, model, dependent, period, extension,
                        directory = dataDirectory) {
  paste0(directory, "/", prefix, "_", model, "_", dependent, "_", period, extension)
}

for (model in model_list) {

  for (y in years) {

    ## Load the dataset once per model and period. Doing this (and the tuning)
    ## before the outcome loop means no outcome depends on objects left over
    ## from an earlier iteration, whatever the order of `dependent_variable`.
    data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

    ## Run the tuning function on the tuning outcome
    ## (s_tuning is presumably the share of the sample used for tuning --
    ## confirm in the sourced tuning function)
    parameters <- tuning(data = data, dependent = tuning_dependent,
                         s_tuning = 0.1, seed = 2111, noisily = TRUE)

    ## Save the final and the searched parameter grids of each learner
    write.csv(parameters$rfgrid_final,
              file = result_file("102_rfgrid", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)
    write.csv(parameters$boostgrid_final,
              file = result_file("102_boostgrid", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)
    write.csv(parameters$lassogrid_final,
              file = result_file("102_lassogrid", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)
    write.csv(parameters$rfgrid_search,
              file = result_file("102_rfgrid_search", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)
    write.csv(parameters$boostgrid_search,
              file = result_file("102_boostgrid_search", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)
    write.csv(parameters$lassogrid_search,
              file = result_file("102_lassogrid_search", model, tuning_dependent, y, ".csv"),
              row.names = FALSE)

    for (dependent in dependent_variable) {

      ## The tuning outcome is trained with s_tuning = 0.1 / s_training = 0.3;
      ## every other outcome with s_tuning = 0 / s_training = 0.4
      ## (presumably so that the tuning share is only held back for the
      ## outcome it was tuned on -- confirm in the sourced prediction function)
      is_tuning_outcome <- dependent == tuning_dependent

      ## Run the prediction function
      results <- estimating(data = data, dependent = dependent,
                            parameters = parameters,
                            s_tuning = if (is_tuning_outcome) 0.1 else 0,
                            s_training = if (is_tuning_outcome) 0.3 else 0.4,
                            seed = 2111, noisily = TRUE)

      ## Save models (stored under the object name `models`, which is what
      ## load() restores later in the script)
      models <- results$models
      save(models,
           file = result_file("103_Models", model, dependent, y, ".rda"))

      ## Save predictions
      write.csv(results$output,
                file = result_file("103_predictionsR", model, dependent, y, ".csv"))

    }
  }
}


##################################################
## Creating predictions for people unemployed in X month using Y month model
########################################################   

## For every model, period and outcome: take the hold-out observations of that
## outcome and score them with the models that were trained on the OTHER
## outcomes, writing one CSV of predictions per (outcome, model outcome) pair.
for (model in model_list) {

  for (y in years) {

    for (dependent in dependent_variable) {

      ## Progress message with the time at which this outcome starts
      print(paste(dependent, "Start", format(Sys.time())))

      ########################################################
      ## Load the dataset
      ########################################################
      data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

      ########################################################
      ## Define the sub-sample for creation of predictions
      ########################################################

      ## Sample sizes are computed on the full file, i.e. BEFORE rows with a
      ## missing outcome are dropped; the hold-out sample starts right after
      ## the tuning (10% + 1) and training (30%) observations.
      n_parametertuning <- round(nrow(data) * 0.1, digits = 0) + 1
      n_training <- round(nrow(data) * 0.3, digits = 0)
      n_training_plus_tuning <- n_training + n_parametertuning
      n_training_plus_tuning_1 <- n_training_plus_tuning + 1

      ## Column where the covariates start and column of the outcome
      first_column <- which(colnames(data) == "Gender")
      y_column <- which(colnames(data) == dependent)

      ## Fail early with a clear message if either column is missing
      stopifnot(
        "column 'Gender' not found in the dataset" = length(first_column) == 1L,
        "outcome variable not found in the dataset" = length(y_column) == 1L
      )

      ## Keep only observations with a non-missing outcome
      data <- data[complete.cases(data[, y_column]), ]

      ## Hold-out sample: observations ordered after the tuning/training part
      data_pred <- data[data$n_order >= n_training_plus_tuning_1, ]

      ## Outcome as a factor (levels are left as they are in the data) and all
      ## columns from "Gender" to the last column as covariates
      total_y <- factor(data_pred[[dependent]])
      total_x <- data_pred[, first_column:ncol(data_pred)]

      ## Numeric matrix version of the covariates, needed by gradient boosting
      ## and LASSO; it does not depend on the model, so it is built only once
      total_x_matrix <- data.matrix(total_x)

      ## Individual ID and n for the predictions
      ## NOTE(review): the hold-out filter uses `n_order` while this uses `n` --
      ## confirm that `n` is a real column of the dataset
      persinfo <- as.data.frame(cbind(data_pred$LopNr_PersonNr, data_pred$n))

      ## The loaded dataset and the hold-out copy are no longer needed
      rm("data", "data_pred")

      for (unempl in c(0, 6, 12)) {

        model_outcome <- paste0("emplAft6M_", unempl, "M_In")

        ## Only models trained on a different outcome are applied here
        if (dependent == model_outcome) {
          next
        }

        ########################################################
        ## Load models (restores the object `models`)
        ########################################################
        load(file = paste0(dataDirectory, "/103_Models_", model, "_", model_outcome, "_", y, ".rda"))

        ########################################################
        ## Create predictions
        ########################################################

        ## Random Forest: second column of the predictions matrix. The three
        ## steps are kept separate because the column name in the output file
        ## is derived from the expression `prob_rf[, 2]`.
        prob_rf <- predict(models$rff_final, total_x)
        prob_rf <- prob_rf[["predictions"]]
        prob_rf <- data.frame(prob_rf[, 2])

        ## Gradient Boost
        prob_boost <- predict(models$rboost_final, total_x_matrix)

        ## LASSO
        prob_lasso <- predict(models$rlasso_final, total_x_matrix, type = "response")

        ########################################################
        ## Save predictions
        ########################################################

        ## Outcome, the three predictions, personal ID and n side by side
        ## (the object names here become column names, so they are kept)
        output <- cbind(total_y, prob_rf, prob_boost, prob_lasso, persinfo)
        write.csv(output, file = paste0(dataDirectory, "/103_predictionsR_", model, "_", dependent, "_", y, "_", model_outcome, "_Model.csv"))
        rm("output")

      }

      ## Free the matrix copy of the covariates before the next outcome
      rm("total_x_matrix")

      ## Progress message with the time at which this outcome is finished
      ## (printed once per outcome, after all other-outcome models were applied)
      print(paste(dependent, "End", format(Sys.time())))

    }

  }

}

## Total run time since `start_time` was recorded at the top of the script
print(Sys.time() - start_time)
    

  



